{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Copyright (c) Meta Platforms, Inc. and affiliates.\n",
    "This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.\n",
    "\n",
    "Use this notebook to pull in datasets and apply pre-processing.  Most grammar datasets unfortunately require preprocessing before being usable in training. (example - jfleg has 4 targets per input, so we have to rematch as 1:1 pairings) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],

   "source": [
    "import csv\n",
    "from datasets import load_metric, load_dataset\n",
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "list_replacements = [\n",
    "  (\" .\", \".\"), \n",
    "  (\" ,\", \",\"),\n",
    "  (\" '\", \"'\"),\n",
    "  (\" ?\", \"?\"),\n",
    "  (\" !\", \"!\"),\n",
    "  (\" :\", \":\"),\n",
    "  (\" ;\", \";\"),\n",
    "  (\" n't\", \"n't\"),\n",
    "  (\" v\", \"v\"),\n",
    "  (\"2 0 0 6\", \"2006\"),\n",
    "  (\"5 5\", \"55\"),\n",
    "  (\"4 0 0\", \"400\"),\n",
    "  (\"1 7-5 0\", \"1750\"),\n",
    "  (\"2 0 %\", \"20%\"),\n",
    "  (\"5 0\", \"50\"),\n",
    "  (\"1 2\", \"12\"),\n",
    "  (\"1 0\", \"10\"),\n",
    "  ('\" ballast water', '\"ballast water')\n",
    "  ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def correct_spacing(item):\n",
    "    \"\"\" we iterate through the list of all replacements per each item in dataset\"\"\"\n",
    "    for fix in list_replacements:\n",
    "        item = item.replace(fix[0], fix[1])\n",
    "    return item\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_csv(csv_path, dataset):\n",
    "    \"\"\" apply spacing corrections and save out matched pairs to csv file as dataset\"\"\"\n",
    "    with open(csv_path, 'w', newline='') as csvfile:\n",
    "        writer = csv.writer(csvfile)\n",
    "        writer.writerow([\"input\", \"target\"])\n",
    "        for case in dataset:\n",
    "     \t    # Adding the t5 task indication prefix to input \n",
  
    "            input_text = case[\"sentence\"]\n",

    "            input_text = correct_spacing(input_text)\n",
    "\n",
    "            for correction in case[\"corrections\"]:\n",
    "              correction = correct_spacing(correction)\n",
    "              # a few of the cases contain blank strings. \n",
    "              if input_text and correction:\n",
    "                writer.writerow([input_text, correction])"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In Jfleg  - validation will be used as 'train', test will be 'validation'"
   ]
  },
  {
   "cell_type": "code",

   "execution_count": 5,

   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [

      "Found cached dataset jfleg (/data/home/mreso/.cache/huggingface/datasets/jfleg/default/1.0.0/ed4ab2367351fe31949f48849ae6732b164f0d5ea6bb5d4357ff4293ac89511b)\n",
      "Found cached dataset jfleg (/data/home/mreso/.cache/huggingface/datasets/jfleg/default/1.0.0/ed4ab2367351fe31949f48849ae6732b164f0d5ea6bb5d4357ff4293ac89511b)\n"

     ]
    }
   ],
   "source": [
    "train_dataset = load_dataset(\"jfleg\", split='validation[:]') \n",
    "eval_dataset = load_dataset(\"jfleg\", split='test[:]')\n"
   ]
  },
  {
   "cell_type": "code",

   "execution_count": 6,

   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset({\n",
      "    features: ['sentence', 'corrections'],\n",
      "    num_rows: 755\n",
      "})\n",
      "Dataset({\n",
      "    features: ['sentence', 'corrections'],\n",
      "    num_rows: 748\n",
      "})\n"
     ]
    }
   ],
   "source": [
    "print(train_dataset)\n",
    "print(eval_dataset)\n"
   ]
  },
  {
   "cell_type": "code",

   "execution_count": 7,

   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Students can focus on only a few subjects they are intwerested in and they will become an experts in those areas . \n",
      "['Students can focus on only a few subjects they are interested in and they will become experts in those areas . ', 'Students can focus on only a few subjects they are interested in and they will become experts in those areas . ', 'Students can focus on only a few subjects they are interested in and they will become an expert in those areas . ', 'Students can focus on only a few subjects they are interested in and they will become an expert in those areas . ']\n"
     ]
    }
   ],
   "source": [
    "print(train_dataset['sentence'][22])\n",
    "print(train_dataset['corrections'][22])"
   ]
  },
  {
   "cell_type": "code",

   "execution_count": 8,

   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Students can focus on only a few subjects they are intwerested in and they will become an experts in those areas. '"
      ]
     },

     "execution_count": 8,

     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clean22 = correct_spacing(train_dataset['sentence'][22])\n",
    "clean22"
   ]
  },
  {
   "cell_type": "code",

   "execution_count": 9,

   "metadata": {},
   "outputs": [],
   "source": [
    "jfleg_dir = Path.cwd()/'jfleg_dataset'  # if you only use 'jfleg', hf will try and use that and complain\n",
    "jfleg_dir.mkdir(parents=True,exist_ok=True)\n",
    "c4_dir = Path.cwd()/'c4_dataset'\n",
    "c4_dir.mkdir(parents=True,exist_ok=True)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Process Jfleg data  "
   ]
  },
  {
   "cell_type": "code",

   "execution_count": 10,

   "metadata": {},
   "outputs": [],
   "source": [
    "j_train_file = jfleg_dir/'jtrain.csv'\n",
    "j_eval_file = jfleg_dir/'jeval.csv'"
   ]
  },
  {
   "cell_type": "code",

   "execution_count": 11,

   "metadata": {},
   "outputs": [],
   "source": [
    "generate_csv(j_train_file, train_dataset)"
   ]
  },
  {
   "cell_type": "code",

   "execution_count": 12,

   "metadata": {},
   "outputs": [],
   "source": [
    "generate_csv(j_eval_file, eval_dataset)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Process C4_200M (!) - we'll pull 10K to start"
   ]
  },
  {
   "cell_type": "code",

   "execution_count": 13,

   "metadata": {},
   "outputs": [],
   "source": [
    "c4_dataset = load_dataset(\"liweili/c4_200m\", streaming = True)"
   ]
  },
  {
   "cell_type": "code",

   "execution_count": 14,

   "metadata": {},
   "outputs": [],
   "source": [
    "iterator = iter(c4_dataset['train'])"
   ]
  },
  {
   "cell_type": "code",

   "execution_count": 15,

   "metadata": {},
   "outputs": [],
   "source": [
    "def c4_generate_csv(csv_path, iterator, num_examples):\n",
    "    with open(csv_path, 'w', newline='') as csvfile:\n",
    "        writer = csv.writer(csvfile)\n",
    "        writer.writerow([\"input\", \"target\"])\n",
    "        for i in range(0,num_examples):\n",
    "          data = next(iterator)\n",

    "          input_text = data[\"input\"]\n",

    "          input_text = correct_spacing(input_text)\n",
    "          correction = correct_spacing(data[\"output\"])\n",
    "          if input_text and correction:\n",
    "            writer.writerow([input_text, correction])"
   ]
  },
  {
   "cell_type": "code",

   "execution_count": 16,

   "metadata": {},
   "outputs": [],
   "source": [
    "c4_dir = Path.cwd()/'c4_dataset'\n",
    "c4_dir.mkdir(parents=True,exist_ok=True)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can modify the following to make the csv file with desired number of instances, here we go for 10k to make a quick test"
   ]
  },
  {
   "cell_type": "code",

   "execution_count": 17,

   "metadata": {},
   "outputs": [],
   "source": [
    "c4_filename = c4_dir/'c4train_10k.csv'"
   ]
  },
  {
   "cell_type": "code",

   "execution_count": 18,

   "metadata": {},
   "outputs": [],
   "source": [
    "c4_generate_csv(c4_filename, iterator, num_examples=10000)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create a single training file by combining jtrain and c4train"
   ]
  },
  {
   "cell_type": "code",

   "execution_count": 19,

   "metadata": {},
   "outputs": [],
   "source": [
    "merge_list = [j_train_file, c4_filename, ]"
   ]
  },
  {
   "cell_type": "code",

   "execution_count": 20,

   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",

   "execution_count": 21,

   "metadata": {},
   "outputs": [],
   "source": [
    "combined_csv = pd.concat([pd.read_csv(fn) for fn in merge_list])\n"
   ]
  },
  {
   "cell_type": "code",

   "execution_count": 22,

   "metadata": {},
   "outputs": [],
   "source": [
    "merged_name = \"gtrain_10k.csv\""
   ]
  },
  {
   "cell_type": "code",

   "execution_count": 23,

   "metadata": {},
   "outputs": [],
   "source": [
    "combined_csv.to_csv(merged_name, index=False, encoding = 'utf-8-sig', )"
   ]
  },
  {
   "cell_type": "code",

   "execution_count": 24,

   "metadata": {},
   "outputs": [],
   "source": [
    "eval_name = \"grammar_validation.csv\""
   ]

  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "eval_csv = pd.read_csv(j_eval_file)\n",
    "eval_csv.to_csv(eval_name, index=False, encoding = 'utf-8-sig', )"
   ]

  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "5b2c14c5f2a3b21e6c2412c8196f5145870350e81c0b737cae3e5c60eb1e1eac"
  },
  "kernelspec": {

   "display_name": "Python 3 (ipykernel)",

   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"

  }
 },
 "nbformat": 4,
 "nbformat_minor": 4

}
